{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#基于用户的协同过滤算法\n",
    "import os\n",
    "import numpy as np\n",
    "#import matplotlib.pyplot as plt\n",
    "#import seaborn as sbn\n",
    "import pandas as pd\n",
    "#正常显示中文标签和负号\n",
    "#plt.rcParams['font.sans-serif'] = ['SimHei']  #用来正常显示中文标签\n",
    "#plt.rcParams['axes.unicode_minus'] = False  #用来正常显示负号\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#输入文件\n",
    "DATA_PATH = os.path.join(os.getcwd(),'data','input')\n",
    "TRAIN = os.path.join(DATA_PATH,'train.csv')\n",
    "TEST = os.path.join(DATA_PATH,'test.csv')\n",
    "SONG =  os.path.join(DATA_PATH,'song.csv')\n",
    "MEMBER =  os.path.join(DATA_PATH,'member.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#读取文件\n",
    "train = pd.read_csv(TRAIN)\n",
    "test = pd.read_csv(TEST)\n",
    "#member = pd.read_csv(MEMBER)\n",
    "#song = pd.read_csv(SONG)\n",
    "train = train[['msno','song_id','target']]\n",
    "test = test[['msno','song_id','target']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#计算用户和电影数量\n",
    "msno_set = set(train['msno'])|set(test['msno'])\n",
    "song_set = set(train['song_id'])|set(test['song_id'])\n",
    "n_users = len(msno_set)\n",
    "n_items = len(song_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>16135</td>\n",
       "      <td>24598</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>13696</td>\n",
       "      <td>9154</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>187</td>\n",
       "      <td>8457</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7843</td>\n",
       "      <td>28248</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4527</td>\n",
       "      <td>12389</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    msno  song_id  target\n",
       "0  16135    24598       1\n",
       "1  13696     9154       0\n",
       "2    187     8457       0\n",
       "3   7843    28248       1\n",
       "4   4527    12389       1"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#创建用户——音乐矩阵\n",
    "#因为又有测试数据，又有训练数据需要创建两个矩阵\n",
    "train_data_matrix = np.zeros((n_users,n_items))\n",
    "for line in train.itertuples():  #自己把每一行弄成tuple\n",
    "    train_data_matrix[line[1]-1,line[2]-1] = line[3]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "169"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "del train,test,msno_set,song_set\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#使用sklearn的pairwise_distances计算余弦相似性。\n",
    "from sklearn.metrics.pairwise import pairwise_distances as paird\n",
    "user_similarity = paird(train_data_matrix,metric = 'cosine')\n",
    "#x先保存一个再算一下一个"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#下一步是预测，已经创建了相似性矩阵\n",
    "#下面基于相似性举证做预测\n",
    "#基于用户的评分可以根据用户均值评分\n",
    "#基于产品评分缺乏针对性\n",
    "def predict(ratings,similarity,type = 'user'):\n",
    "    if type == 'user':\n",
    "        mean_user_rating = ratings.mean(axis = 1)\n",
    "        #You use np.newaxis so that mean_user_rating has same format as ratings\n",
    "        ratings_diff = (ratings - mean_user_rating[:,np.newaxis])\n",
    "        pred = mean_user_rating[:,np.newaxis] + similarity.dot(ratings_diff)/np.array([np.abs(similarity).sum(axis=1)]).T\n",
    "    elif type == 'item':\n",
    "        pred = ratings.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])     \n",
    "    return pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "ename": "MemoryError",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mMemoryError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-9-eb9831b90188>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mtrain_user_prediction\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain_data_matrix\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0muser_similarity\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mtype\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'user'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32m<ipython-input-8-6fecce90bef7>\u001b[0m in \u001b[0;36mpredict\u001b[1;34m(ratings, similarity, type)\u001b[0m\n\u001b[0;32m      8\u001b[0m         \u001b[1;31m#You use np.newaxis so that mean_user_rating has same format as ratings\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      9\u001b[0m         \u001b[0mratings_diff\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mratings\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mmean_user_rating\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m         \u001b[0mpred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmean_user_rating\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0msimilarity\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mratings_diff\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m/\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mabs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msimilarity\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mT\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     11\u001b[0m     \u001b[1;32melif\u001b[0m \u001b[0mtype\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'item'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     12\u001b[0m         \u001b[0mpred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mratings\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msimilarity\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m/\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mabs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msimilarity\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mMemoryError\u001b[0m: "
     ]
    }
   ],
   "source": [
    "train_user_prediction = predict(train_data_matrix,user_similarity,type = 'user')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train = pd.read_csv(TRAIN)\n",
    "train = train[['msno','song_id','target']]\n",
    "train['user_pre_target'] = 0\n",
    "for line in train.itertuples():  #自己把每一行弄成tuple\n",
    "    line[4] = user_prediction[line[1]-1,line[2]-1]\n",
    "#模型评价\n",
    "from sklearn import metrics\n",
    "from scipy import interp\n",
    "cm_train = metrics.confusion_matrix(train['target'],train['user_pre_target'])\n",
    "print(cm_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "198"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test = pd.read_csv(TEST)\n",
    "test = test[['msno','song_id','target']]\n",
    "test_data_matrix = np.zeros((n_users,n_items))\n",
    "for line in test.itertuples():\n",
    "    test_data_matrix[line[1]-1,line[2]-1] = line[3]\n",
    "test_user_prediction = predict(test_data_matrix,user_similarity,type = 'user')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#提取预测结果算概率\n",
    "train = pd.read_csv(TRAIN)\n",
    "test = pd.read_csv(TEST)\n",
    "#member = pd.read_csv(MEMBER)\n",
    "#song = pd.read_csv(SONG)\n",
    "train = train[['msno','song_id','target']]\n",
    "test = test[['msno','song_id','target']]\n",
    "test['user_pre_target'] = 0\n",
    "train['user_pre_target'] = 0\n",
    "for line in train.itertuples():  #自己把每一行弄成tuple\n",
    "    line[4] = user_prediction[line[1]-1,line[2]-1]\n",
    "for line in test.itertuples():\n",
    "    line[4] = user_prediction[line[1]-1,line[2]-1] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "205"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.savetxt(os.path.join(DATA_PATH,'user_similarity.txt'),user_similarity)\n",
    "del user_similarity\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "item_similarity = paird(train_data_matrix.T,metric = 'cosine')\n",
    "np.savetxt(os.path.join(DATA_PATH,'item_similarity.txt'),item_similarity)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'item_similarity' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-18-6a9c756e9a44>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mdel\u001b[0m \u001b[0mitem_similarity\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m \u001b[0mgc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcollect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNameError\u001b[0m: name 'item_similarity' is not defined"
     ]
    }
   ],
   "source": [
    "del item_similarity\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "ename": "MemoryError",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mMemoryError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-8-64b0ff3d6793>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m#基于用户相似性的预测\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0muser_similarity\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloadtxt\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mDATA_PATH\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'user_similarity.txt'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m  \u001b[1;31m#提取相似性文件\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mD:\\Users\\10100\\Anaconda3\\lib\\site-packages\\numpy\\lib\\npyio.py\u001b[0m in \u001b[0;36mloadtxt\u001b[1;34m(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin)\u001b[0m\n\u001b[0;32m   1030\u001b[0m             \u001b[0mfh\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1031\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1032\u001b[1;33m     \u001b[0mX\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1033\u001b[0m     \u001b[1;31m# Multicolumn data are returned with shape (1, N, M), i.e.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1034\u001b[0m     \u001b[1;31m# (1, 1, M) for a single row - remove the singleton dimension there\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mMemoryError\u001b[0m: "
     ]
    }
   ],
   "source": [
    "#基于用户相似性的预测\n",
    "#user_similarity = np.loadtxt(os.path.join(DATA_PATH,'user_similarity.txt'))  #提取相似性文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "item_prediction = predict(train_data_matrix, item_similarity, type='item')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#模型评价\n",
    "from sklearn import metrics\n",
    "from scipy import interp \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#模型评价\n",
    "from sklearn import metrics\n",
    "from scipy import interp \n",
    "cm_train = metrics.confusion_matrix(y_train,model_f1.predict(X_train).round())  #训练样本的混淆矩阵\n",
    "print(cm_train)\n",
    "cm_test = metrics.confusion_matrix(y_test,model_f1.predict(X_test).round())  #训练样本的混淆矩阵\n",
    "print(cm_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#使用模型的混合推荐"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:Anaconda3]",
   "language": "python",
   "name": "conda-env-Anaconda3-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
